El presente documento constituye el análisis exploratorio de datos de defunciones registradas en Guatemala durante el período 2011-2021. Los datos provienen del Instituto Nacional de Estadística (INE) y contienen información detallada sobre cada defunción registrada oficialmente en el país.
# Source CSV files, read in this order and row-bound into `datos_raw` by the
# loading step that follows.
# NOTE(review): presumably one INE export per year; the file names do not
# encode the year, so confirm the file-to-year mapping against the download.
archivos <- c(
"lSg2Lmx2sWne8RcD9hM6guU73I9eQW8B.csv",
"QnZLxknSHwbFfpJ8I1frbkoIKfz1BBjd.csv",
"iY2sN6q3d4ihJpgzr7KgJpQAiEn0bo60.csv",
"DX2BmYU5m4JfPRhrFHwDRDEs49V7fN5I.csv",
"bb6vENc1cmPlBToSEEr6HESWdZk6tHFs.csv",
"20171204152107xRp35JuZin7nN2x88Me8MVcQvyZCnu5K.csv",
"20181226142907xRp35JuZin7nN2x88Me8MVcQvyZCnu5K.csv",
"201911291520069Odm3oxU9mTY58hkborwzylm7MJop05q.csv",
"20201201154851el8puh8r6zutgVKBoRIbazWluzIr25A3.csv",
"20210930225530FopQpWf6BcBWj8taVS3Q3mRKxgDsvwPe.csv"
)
# Read every file in `archivos` with all columns as character (numeric
# conversion happens later, in the cleaning step) and stack them row-wise.
#
# Column names are normalised per file *before* binding. Cleaning after the
# bind keeps spelling variants of the same column (e.g. differing only in case)
# as separate columns, which clean_names() then disambiguates as
# `mupreg` / `mupreg_2`, `mupocu` / `mupocu_2`. The missing-value summary in
# this report shows those pairs with complementary missing rates
# (91.06% + 8.94%), which suggests they are one variable split in two.
datos_raw <- archivos %>%
  map_dfr(
    ~ read_csv(.x, col_types = cols(.default = "c")) %>%
      clean_names()
  )
cat("Datos cargados:", nrow(datos_raw), "observaciones ×", ncol(datos_raw), "variables\n")## Datos cargados: 809296 observaciones × 35 variables
# Dimensions of the raw dataset as a small two-column table.
# All values are stored as text, so the column count is converted explicitly.
data.frame(
  Característica = c("Observaciones", "Variables", "Período"),
  Valor = c(
    format(nrow(datos_raw), big.mark = ","),
    as.character(ncol(datos_raw)),
    "2011-2021"
  )
) %>%
  kable(caption = "Dimensiones del Dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Característica | Valor |
|---|---|
| Observaciones | 809,296 |
| Variables | 35 |
| Período | 2011-2021 |
# Columns expected to hold numbers; every other column of the raw data is
# counted as categorical.
vars_numericas_esperadas <- c("anoreg", "mesreg", "diaocu", "edadif")
vars_categoricas <- names(datos_raw) %>%
  setdiff(vars_numericas_esperadas)

# Count of variables per type.
data.frame(
  Tipo = c("Numéricas", "Categóricas", "Total"),
  Cantidad = c(
    length(vars_numericas_esperadas),
    length(vars_categoricas),
    ncol(datos_raw)
  )
) %>%
  kable(caption = "Tipos de Variables") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Tipo | Cantidad |
|---|---|
| Numéricas | 4 |
| Categóricas | 31 |
| Total | 35 |
# Data dictionary for the main variables (name and description), built
# column-wise.
tibble(
  Variable = c(
    "anoreg", "mesreg", "depreg", "depocu", "sexo",
    "edadif", "perdif", "caudef", "asist", "ocur"
  ),
  Descripción = c(
    "Año de registro",
    "Mes de registro",
    "Departamento de registro",
    "Departamento de ocurrencia",
    "Sexo del fallecido",
    "Edad del fallecido",
    "Período de edad",
    "Código CIE-10",
    "Asistencia médica",
    "Lugar de ocurrencia"
  )
) %>%
  kable(caption = "Diccionario de Variables Principales") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Variable | Descripción |
|---|---|
| anoreg | Año de registro |
| mesreg | Mes de registro |
| depreg | Departamento de registro |
| depocu | Departamento de ocurrencia |
| sexo | Sexo del fallecido |
| edadif | Edad del fallecido |
| perdif | Período de edad |
| caudef | Código CIE-10 |
| asist | Asistencia médica |
| ocur | Lugar de ocurrencia |
# Percentage of missing values per column. A value counts as missing when it
# is NA, an empty string or a single space. Only columns with at least one
# missing value are kept, sorted from most to least missing.
na_summary <- datos_raw %>%
  summarise(
    across(
      everything(),
      ~ sum(is.na(.x) | .x %in% c("", " ")) / n() * 100
    )
  ) %>%
  pivot_longer(
    everything(),
    names_to = "Variable",
    values_to = "Porcentaje_NA"
  ) %>%
  filter(Porcentaje_NA > 0) %>%
  arrange(desc(Porcentaje_NA))
# Table of the (up to) 15 columns with the highest share of missing values,
# with the percentage rendered as text.
na_summary %>%
  slice_head(n = 15) %>%
  mutate(Porcentaje_NA = paste0(round(Porcentaje_NA, 2), "%")) %>%
  kable(
    caption = "Top 15 Variables con Valores Faltantes",
    col.names = c("Variable", "% Faltantes")
  ) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Variable | % Faltantes |
|---|---|
| mupreg | 91.06% |
| mupocu | 91.06% |
| anoocu | 91.06% |
| caudef_descrip | 90.53% |
| getdif | 82.08% |
| ocudif | 82.08% |
| anoocu_2 | 37% |
| areag | 32.7% |
| puedif | 17.92% |
| ciuodif | 17.92% |
| mupreg_2 | 8.94% |
| mupocu_2 | 8.94% |
| pnadif | 8.94% |
| predif | 8.94% |
# Horizontal bar chart of the (up to) 20 columns with most missing values,
# ordered by percentage.
na_summary %>%
  slice_head(n = 20) %>%
  ggplot(aes(reorder(Variable, Porcentaje_NA), Porcentaje_NA)) +
  geom_col(fill = "coral", alpha = 0.8) +
  coord_flip() +
  labs(
    title = "Variables con Valores Faltantes",
    x = "Variable",
    y = "% Faltantes"
  ) +
theme_minimal()

# Cleaning step: builds the analysis table `datos` from `datos_raw`.
#   1. Upper-case every text column and strip accents (Latin-ASCII).
#   2. Convert year, month, day and age to numeric.
#   3. Derive `edad_anios` (age in years) from `edadif` and its unit `perdif`.
#   4. Derive `causa_capitulo` as the first character of the cause code.
#   5. Drop rows whose age in years exceeds 115; unknown ages are kept.
datos <- datos_raw %>%
  mutate(across(
    where(is.character),
    ~ str_to_upper(stri_trans_general(.x, "Latin-ASCII"))
  )) %>%
  # mesreg: plain as.numeric() turned every value into NA (the numeric summary
  # in this report shows N = 0 for mesreg), presumably because the column
  # holds month names. Accept a month number or a Spanish month name; anything
  # else stays NA as before.
  mutate(mesreg = coalesce(
    suppressWarnings(as.numeric(mesreg)),
    as.numeric(match(
      str_trim(mesreg),
      c(
        "ENERO", "FEBRERO", "MARZO", "ABRIL", "MAYO", "JUNIO",
        "JULIO", "AGOSTO", "SEPTIEMBRE", "OCTUBRE", "NOVIEMBRE", "DICIEMBRE"
      )
    ))
  )) %>%
  mutate(across(c(anoreg, diaocu, edadif), as.numeric)) %>%
  # The first matching unit wins; rows with an unrecognised unit get NA.
  # NOTE(review): a label such as "MENOS DE UN MES" would match "MES" and be
  # divided by 12 -- confirm the set of perdif labels in the data.
  mutate(edad_anios = case_when(
    str_detect(perdif, "ANO") ~ edadif,
    str_detect(perdif, "MES") ~ edadif / 12,
    str_detect(perdif, "DIA") ~ edadif / 365,
    str_detect(perdif, "HORA") ~ edadif / 8760,
    TRUE ~ NA_real_
  )) %>%
  # NOTE(review): this is the leading letter of the ICD-10 code; ICD-10
  # chapters span letter ranges, so it is a letter grouping rather than the
  # formal chapter -- confirm the labelling used in the report.
  mutate(causa_capitulo = str_sub(caudef, 1, 1)) %>%
  filter(edad_anios <= 115 | is.na(edad_anios))
cat("Registros después de limpieza:", format(nrow(datos), big.mark = ","), "\n")## Registros después de limpieza: 809,292
# Summary statistics (N, mean, median, sd, min, quartiles, max) for the
# numeric variables, one row per variable.
#
# The summary columns are named "<variable>__<statistic>" and split on the
# double underscore. Splitting on the last single underscore broke
# "Desv_Std" apart, producing a "Std" column and spurious "<variable>_Desv"
# rows.
datos %>%
  select(anoreg, mesreg, diaocu, edadif, edad_anios) %>%
  summarise(across(
    where(is.numeric),
    list(
      N = ~ sum(!is.na(.)),
      Media = ~ mean(., na.rm = TRUE),
      Mediana = ~ median(., na.rm = TRUE),
      Desv_Std = ~ sd(., na.rm = TRUE),
      Min = ~ min(., na.rm = TRUE),
      Q1 = ~ quantile(., 0.25, na.rm = TRUE),
      Q3 = ~ quantile(., 0.75, na.rm = TRUE),
      Max = ~ max(., na.rm = TRUE)
    ),
    .names = "{.col}__{.fn}"
  )) %>%
  pivot_longer(
    everything(),
    names_to = c("Variable", "Estadística"),
    names_sep = "__",
    values_to = "Valor"
  ) %>%
  mutate(Valor = round(Valor, 2)) %>%
  pivot_wider(names_from = Estadística, values_from = Valor) %>%
  kable(caption = "Resumen estadístico de variables numéricas") %>%
  kable_styling(full_width = FALSE) %>%
scroll_box(width = "100%")| Variable | N | Media | Mediana | Std | Min | Q1 | Q3 | Max |
|---|---|---|---|---|---|---|---|---|
| anoreg | 809292 | 2015.74 | 2016 | NA | 2011 | 2013 | 2018 | 2021 |
| anoreg_Desv | NA | NA | NA | 2.89 | NA | NA | NA | NA |
| mesreg | 0 | NaN | NA | NA | Inf | NA | NA | -Inf |
| mesreg_Desv | NA | NA | NA | NA | NA | NA | NA | NA |
| diaocu | 809292 | 15.67 | 16 | NA | 1 | 8 | 23 | 31 |
| diaocu_Desv | NA | NA | NA | 8.82 | NA | NA | NA | NA |
| edadif | 802793 | 54.14 | 61 | NA | 0 | 32 | 78 | 115 |
| edadif_Desv | NA | NA | NA | 28.11 | NA | NA | NA | NA |
| edad_anios | 802793 | 53.63 | 61 | NA | 0 | 32 | 78 | 115 |
| edad_anios_Desv | NA | NA | NA | 28.96 | NA | NA | NA | NA |
# Descriptive statistics of age in years over rows with a known age,
# reshaped to one statistic per row and rounded to two decimals.
datos %>%
  filter(!is.na(edad_anios)) %>%
  summarise(
    N = n(),
    Media = mean(edad_anios),
    Mediana = median(edad_anios),
    Desv_Std = sd(edad_anios),
    Q1 = quantile(edad_anios, probs = 0.25),
    Q3 = quantile(edad_anios, probs = 0.75),
    Mínimo = min(edad_anios),
    Máximo = max(edad_anios)
  ) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Estadística",
    values_to = "Valor"
  ) %>%
  mutate(Valor = round(Valor, 2)) %>%
  kable(caption = "Estadísticas de Edad") %>%
kable_styling(full_width = FALSE)| Estadística | Valor |
|---|---|
| N | 802793.00 |
| Media | 53.63 |
| Mediana | 61.00 |
| Desv_Std | 28.96 |
| Q1 | 32.00 |
| Q3 | 78.00 |
| Mínimo | 0.00 |
| Máximo | 115.00 |
# Overlaid (not stacked) age histograms for each sex, 50 bins, rows with
# unknown age or sex excluded.
datos %>%
  filter(!(is.na(edad_anios) | is.na(sexo))) %>%
  ggplot(aes(edad_anios, fill = sexo)) +
  geom_histogram(bins = 50, alpha = 0.6, position = "identity") +
  scale_fill_manual(
    values = c("HOMBRE" = "#2c7bb6", "MUJER" = "#d7191c"),
    name = "Sexo"
  ) +
  labs(
    title = "Distribución de Edad por Sexo",
    x = "Edad (Años)",
    y = "Frecuencia"
  ) +
  theme_minimal() +
  theme(legend.position = "top")
## Normality (age)
# Shapiro-Wilk on a random sample of at most 5000 values (the test is not
# well suited to very large n). The seed makes the sample reproducible.
# NOTE(review): `x` is not defined in the visible code; presumably the
# non-missing values of edad_anios -- confirm where it is created.
set.seed(123)
xm <- sample(x, size = min(length(x), 5000))
shapiro.test(xm)##
## Shapiro-Wilk normality test
##
## data: xm
## W = 0.91889, p-value < 2.2e-16
# Age boxplots by sex; the yellow diamond marks the group mean.
# Rows with unknown age or sex are excluded.
datos %>%
  filter(!(is.na(edad_anios) | is.na(sexo))) %>%
  ggplot(aes(sexo, edad_anios, fill = sexo)) +
  geom_boxplot(alpha = 0.7, outlier.alpha = 0.3) +
  scale_fill_manual(
    values = c("HOMBRE" = "#2c7bb6", "MUJER" = "#d7191c")
  ) +
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 23,
    size = 3,
    fill = "yellow"
  ) +
  labs(
    title = "Boxplot de Edad por Sexo",
    x = "Sexo",
    y = "Edad (Años)"
  ) +
  theme_minimal() +
theme(legend.position = "none")

# Number of registered deaths per registration year, as a labelled line chart
# with one x-axis break per year.
datos %>%
  count(anoreg) %>%
  ggplot(aes(x = anoreg, y = n)) +
  # Line thickness is set with `linewidth`; using `size` for lines is
  # deprecated since ggplot2 3.4.0 and emits a warning.
  geom_line(color = "darkblue", linewidth = 1.5) +
  geom_point(color = "darkblue", size = 4) +
  geom_text(aes(label = comma(n)), vjust = -0.8, size = 3.5) +
  scale_x_continuous(
    breaks = seq(
      min(datos$anoreg, na.rm = TRUE),
      max(datos$anoreg, na.rm = TRUE),
      1
    )
  ) +
  scale_y_continuous(labels = comma) +
  labs(title = "Evolución de Defunciones por Año", x = "Año", y = "Defunciones") +
theme_minimal()

# Mean (solid green) and median (dashed orange) age at death per registration
# year, over rows with known age and year.
datos %>%
  filter(!is.na(edad_anios), !is.na(anoreg)) %>%
  group_by(anoreg) %>%
  summarise(
    edad_promedio = mean(edad_anios),
    edad_mediana = median(edad_anios)
  ) %>%
  ggplot(aes(x = anoreg, y = edad_promedio)) +
  # Line thickness is set with `linewidth`; using `size` for lines is
  # deprecated since ggplot2 3.4.0 and emits a warning.
  geom_line(color = "darkgreen", linewidth = 1.5) +
  geom_point(color = "darkgreen", size = 4) +
  geom_line(
    aes(y = edad_mediana),
    color = "orange",
    linewidth = 1.5,
    linetype = "dashed"
  ) +
  scale_x_continuous(
    breaks = seq(
      min(datos$anoreg, na.rm = TRUE),
      max(datos$anoreg, na.rm = TRUE),
      1
    )
  ) +
  labs(
    title = "Edad Promedio de Fallecimiento por Año",
    subtitle = "Sólida: Media | Punteada: Mediana",
    x = "Año", y = "Edad (Años)"
  ) +
theme_minimal()

# Deaths per registration month, bars in calendar order.
# Requires `mesreg` to be a month number (1-12); rows where it is NA are
# dropped. Labels come from base R's `month.name`, i.e. English month names.
datos %>%
  filter(!is.na(mesreg)) %>%
  count(mesreg) %>%
  mutate(mes_nombre = month.name[mesreg]) %>%
  ggplot(aes(reorder(mes_nombre, mesreg), n)) +
  geom_col(fill = "steelblue", alpha = 0.8) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Distribución por Mes de Registro",
    x = "Mes",
    y = "Frecuencia"
  ) +
  theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Frequency and percentage of deaths by sex (rows with unknown sex excluded
# before the percentage is computed). Reused by the bar chart of the same data.
tabla_sexo <- datos %>%
  count(sexo) %>%
  filter(!is.na(sexo)) %>%
  mutate(porcentaje = paste0(round(n / sum(n) * 100, 2), "%"))

tabla_sexo %>%
  kable(
    caption = "Distribución por Sexo",
    col.names = c("Sexo", "Frecuencia", "Porcentaje"),
    format.args = list(big.mark = ",")
  ) %>%
kable_styling(full_width = FALSE)| Sexo | Frecuencia | Porcentaje |
|---|---|---|
| HOMBRE | 454,898 | 56.21% |
| MUJER | 354,394 | 43.79% |
# Bar chart of deaths by sex with the count printed above each bar.
tabla_sexo %>%
  ggplot(aes(sexo, n, fill = sexo)) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = comma(n)), vjust = -0.5, size = 5) +
  scale_fill_manual(
    values = c("HOMBRE" = "#2c7bb6", "MUJER" = "#d7191c")
  ) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Distribución por Sexo",
    x = "Sexo",
    y = "Frecuencia"
  ) +
  theme_minimal() +
theme(legend.position = "none")

# Frequency and percentage of deaths by area. Rows with a missing area are
# removed first, so percentages are relative to records with a known value.
datos %>%
  count(areag) %>%
  filter(!is.na(areag)) %>%
  mutate(porcentaje = paste0(round(n / sum(n) * 100, 2), "%")) %>%
  kable(
    caption = "Distribución por Área",
    col.names = c("Área", "Frecuencia", "Porcentaje"),
    format.args = list(big.mark = ",")
  ) %>%
kable_styling(full_width = FALSE)| Área | Frecuencia | Porcentaje |
|---|---|---|
| IGNORADO | 10,171 | 1.87% |
| RURAL | 239,386 | 43.95% |
| URBANO | 295,064 | 54.18% |
# Ten most frequent categories of medical assistance with their share.
# The percentage is computed over ALL non-missing records before the table is
# truncated; computing it after head(10) would make it relative to the top-10
# subtotal whenever there are more than ten categories.
datos %>%
  count(asist) %>%
  drop_na() %>%
  arrange(desc(n)) %>%
  mutate(porcentaje = paste0(round(n / sum(n) * 100, 2), "%")) %>%
  head(10) %>%
  kable(
    caption = "Top 10 - Asistencia Médica",
    col.names = c("Asistencia", "Frecuencia", "Porcentaje"),
    format.args = list(big.mark = ",")
  ) %>%
kable_styling()| Asistencia | Frecuencia | Porcentaje |
|---|---|---|
| NINGUNA | 397,807 | 49.15% |
| MEDICA | 380,022 | 46.96% |
| EMPIRICA | 18,318 | 2.26% |
| IGNORADO | 9,482 | 1.17% |
| PARAMEDICA | 3,028 | 0.37% |
| COMADRONA | 635 | 0.08% |
# Horizontal bar chart of the ten most frequent medical-assistance
# categories, labelled with their counts.
datos %>%
  count(asist, sort = TRUE) %>%
  filter(!is.na(asist)) %>%
  head(10) %>%
  ggplot(aes(reorder(asist, n), n)) +
  geom_col(fill = "darkgreen", alpha = 0.8) +
  geom_text(aes(label = comma(n)), hjust = -0.2, size = 3.5) +
  coord_flip() +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Top 10 - Asistencia Médica",
    x = "Asistencia",
    y = "Frecuencia"
  ) +
theme_minimal()

# Frequency and percentage of deaths by place of occurrence (missing places
# excluded before the percentage is computed).
datos %>%
  count(ocur) %>%
  filter(!is.na(ocur)) %>%
  mutate(porcentaje = paste0(round(n / sum(n) * 100, 2), "%")) %>%
  kable(
    caption = "Lugar de Ocurrencia",
    col.names = c("Lugar", "Frecuencia", "Porcentaje"),
    format.args = list(big.mark = ",")
  ) %>%
kable_styling()| Lugar | Frecuencia | Porcentaje |
|---|---|---|
| CENTRO DE SALUD | 2,221 | 0.27% |
| DOMICILIO | 492,727 | 60.88% |
| HOSPITAL PRIVADO | 22,830 | 2.82% |
| HOSPITAL PUBLICO | 167,440 | 20.69% |
| IGNORADO | 45,301 | 5.6% |
| LUGAR DE TRABAJO | 61 | 0.01% |
| OTRO | 11,815 | 1.46% |
| SEGURO SOCIAL | 41,581 | 5.14% |
| VIA PUBLICA | 25,316 | 3.13% |
# Horizontal bar chart of deaths by place of occurrence, ordered by count.
datos %>%
  count(ocur) %>%
  filter(!is.na(ocur)) %>%
  ggplot(aes(reorder(ocur, n), n)) +
  geom_col(fill = "purple", alpha = 0.7) +
  geom_text(aes(label = comma(n)), hjust = -0.2, size = 3.5) +
  coord_flip() +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Lugar de Ocurrencia",
    x = "Lugar",
    y = "Frecuencia"
  ) +
theme_minimal()

# Frequency and percentage of deaths per `causa_capitulo` value, most
# frequent first (missing values excluded).
datos %>%
  count(causa_capitulo) %>%
  filter(!is.na(causa_capitulo)) %>%
  arrange(desc(n)) %>%
  mutate(porcentaje = paste0(round(n / sum(n) * 100, 2), "%")) %>%
  kable(
    caption = "Capítulos CIE-10",
    col.names = c("Capítulo", "Frecuencia", "Porcentaje"),
    format.args = list(big.mark = ",")
  ) %>%
kable_styling()| Capítulo | Frecuencia | Porcentaje |
|---|---|---|
| I | 130,082 | 16.07% |
| J | 91,365 | 11.29% |
| R | 82,219 | 10.16% |
| E | 82,080 | 10.14% |
| X | 80,559 | 9.95% |
| C | 77,191 | 9.54% |
| K | 61,417 | 7.59% |
| A | 35,787 | 4.42% |
| N | 34,084 | 4.21% |
| P | 29,173 | 3.6% |
| W | 21,099 | 2.61% |
| G | 15,760 | 1.95% |
| V | 13,391 | 1.65% |
| Q | 13,247 | 1.64% |
| D | 11,674 | 1.44% |
| U | 8,008 | 0.99% |
| F | 5,744 | 0.71% |
| B | 5,696 | 0.7% |
| Y | 3,123 | 0.39% |
| M | 3,063 | 0.38% |
| O | 3,010 | 0.37% |
| L | 1,463 | 0.18% |
| H | 57 | 0.01% |
# Horizontal bar chart of deaths per `causa_capitulo` value, ordered by count.
datos %>%
  count(causa_capitulo) %>%
  filter(!is.na(causa_capitulo)) %>%
  ggplot(aes(reorder(causa_capitulo, n), n)) +
  geom_col(fill = "darkred", alpha = 0.7) +
  geom_text(aes(label = comma(n)), hjust = -0.2, size = 3) +
  coord_flip() +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Defunciones por Capítulo CIE-10",
    x = "Capítulo",
    y = "Frecuencia"
  ) +
theme_minimal()

# 100% stacked bars: share of each place of occurrence within each sex.
# Rows with unknown sex or place are excluded.
datos %>%
  filter(!(is.na(sexo) | is.na(ocur))) %>%
  ggplot(aes(sexo, fill = ocur)) +
  geom_bar(position = "fill") +
  labs(
    title = "Lugar de Ocurrencia por Sexo",
    y = "Proporción",
    x = "Sexo",
    fill = "Lugar"
  ) +
  scale_y_continuous(labels = percent) +
  theme_minimal() +
theme(legend.position = "bottom")

# 100% stacked bars: share of each kind of medical assistance within each
# area. `prop` is the within-area proportion of each category.
datos %>%
  filter(!(is.na(areag) | is.na(asist))) %>%
  count(areag, asist) %>%
  mutate(prop = n / sum(n), .by = areag) %>%
  ggplot(aes(areag, prop, fill = asist)) +
  geom_col(position = "fill") +
  labs(
    title = "Asistencia Médica por Área",
    x = "Área",
    y = "Proporción",
    fill = "Asistencia"
  ) +
  scale_y_continuous(labels = percent) +
  theme_minimal() +
theme(legend.position = "bottom")

# Deaths by department of occurrence and sex (side-by-side bars), restricted
# to departments whose total across both sexes is at least 10,000.
datos %>%
  filter(!(is.na(depocu) | is.na(sexo))) %>%
  count(depocu, sexo) %>%
  filter(sum(n) >= 10000, .by = depocu) %>%
  ggplot(aes(reorder(depocu, n), n, fill = sexo)) +
  geom_col(position = "dodge", alpha = 0.8) +
  coord_flip() +
  scale_fill_manual(
    values = c("HOMBRE" = "#2c7bb6", "MUJER" = "#d7191c")
  ) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Defunciones por Departamento y Sexo",
    subtitle = "Departamentos con 10,000+ casos",
    x = "Departamento",
    y = "Defunciones"
  ) +
  theme_minimal() +
theme(legend.position = "top")

# Fifteen departments with the most deaths, labelled with count and share.
# The share is computed over ALL records with a known department before the
# list is cut to 15; computing it after head(15) reported each department's
# share of the top-15 subtotal instead of its share of all deaths.
datos %>%
  count(depocu) %>%
  drop_na() %>%
  arrange(desc(n)) %>%
  mutate(porcentaje = n / sum(n) * 100) %>%
  head(15) %>%
  ggplot(aes(x = reorder(depocu, n), y = n)) +
  geom_col(fill = "darkgreen", alpha = 0.8) +
  geom_text(
    aes(label = paste0(comma(n), "\n(", round(porcentaje, 1), "%)")),
    hjust = -0.1, size = 3
  ) +
  coord_flip() +
  # Upper limit padded by 15% so the text labels fit beside the longest bar.
  scale_y_continuous(
    labels = comma,
    limits = c(
      0,
      max(datos %>% count(depocu) %>% pull(n), na.rm = TRUE) * 1.15
    )
  ) +
  labs(title = "Top 15 Departamentos", x = "Departamento", y = "Defunciones") +
theme_minimal()set.seed(42)
# Scatterplot matrix of age, registration year and day of occurrence, drawn
# on a random sample of at most 1000 complete rows (the sample depends on the
# seed set just before this pipeline).
datos %>%
select(edad_anios, anoreg, diaocu) %>%
drop_na() %>%
slice_sample(n = min(1000, nrow(.))) %>%
# NOTE(review): alpha is given inside aes(), so it is treated as a mapped
# aesthetic rather than a fixed transparency -- confirm this is intended.
ggpairs(aes(alpha = 0.4)) +
theme_minimal()

# Deaths per registration year split by sex, one line per sex, with one
# x-axis break per year. Rows with unknown year or sex are excluded.
datos %>%
  filter(!is.na(anoreg), !is.na(sexo)) %>%
  count(anoreg, sexo) %>%
  ggplot(aes(x = anoreg, y = n, color = sexo)) +
  # Line thickness is set with `linewidth`; using `size` for lines is
  # deprecated since ggplot2 3.4.0 and emits a warning.
  geom_line(linewidth = 1.5) +
  geom_point(size = 3) +
  scale_color_manual(values = c("HOMBRE" = "#2c7bb6", "MUJER" = "#d7191c")) +
  scale_x_continuous(
    breaks = seq(
      min(datos$anoreg, na.rm = TRUE),
      max(datos$anoreg, na.rm = TRUE),
      1
    )
  ) +
  scale_y_continuous(labels = comma) +
  labs(title = "Evolución de Defunciones por Sexo", x = "Año", y = "Defunciones") +
  theme_minimal() +
theme(legend.position = "top")Variables Numéricas:
Variables Categóricas:
Relaciones:
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/Guatemala
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] scales_1.4.0 kableExtra_1.4.0 stringi_1.8.7 GGally_2.4.0
## [5] janitor_2.2.1 lubridate_1.9.4 forcats_1.0.1 stringr_1.6.0
## [9] dplyr_1.1.4 purrr_1.2.1 readr_2.1.6 tidyr_1.3.2
## [13] tibble_3.3.1 ggplot2_4.0.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.10 generics_0.1.4 xml2_1.5.1 hms_1.1.4
## [5] digest_0.6.39 magrittr_2.0.4 evaluate_1.0.5 grid_4.5.2
## [9] timechange_0.3.0 RColorBrewer_1.1-3 fastmap_1.2.0 jsonlite_2.0.0
## [13] viridisLite_0.4.2 textshaping_1.0.4 jquerylib_0.1.4 cli_3.6.5
## [17] crayon_1.5.3 rlang_1.1.7 bit64_4.6.0-1 withr_3.0.2
## [21] cachem_1.1.0 yaml_2.3.12 parallel_4.5.2 tools_4.5.2
## [25] tzdb_0.5.0 ggstats_0.12.0 vctrs_0.7.0 R6_2.6.1
## [29] lifecycle_1.0.5 snakecase_0.11.1 bit_4.6.0 vroom_1.6.7
## [33] pkgconfig_2.0.3 pillar_1.11.1 bslib_0.9.0 gtable_0.3.6
## [37] glue_1.8.0 systemfonts_1.3.1 xfun_0.55 tidyselect_1.2.1
## [41] rstudioapi_0.18.0 knitr_1.51 farver_2.1.2 htmltools_0.5.9
## [45] labeling_0.4.3 rmarkdown_2.30 svglite_2.2.2 compiler_4.5.2
## [49] S7_0.2.1